#!/usr/bin/env python
# -*- coding:utf8 -*-

'''
#全国企业信用信息公示系统（黑龙江）
#维护黄羽
'''

import re
import urllib2
from bs4 import BeautifulSoup
from utils import kill_captcha
import traceback
import requests
import copy
import random

from scpy.logger import get_logger
from scpy.xawesome_time import parse_time
import table
import sd_template_dict as TE
from get_page import *
import request_util

# Module-level logger used by every function in this file.
logger = get_logger(__file__)

# User-Agent string picked at random once, when the module is imported; the
# header dictionaries built further down all reuse this same value.
ua = random.choice(request_util.USER_AGENTS)


def download_captcha_kill(companyName):
    """
    Download a captcha, solve it, then search the site for a company.

    The captcha download and the search POST go through the same
    ``requests`` session (presumably so the server can tie the captcha to the
    search - confirm against the site). The session is closed before this
    function returns or raises.

    :param companyName: company name sent as the ``entName`` form field.
    :return: the first ``/businessPublicity.jspx?id=...`` link found in the
             search result page when the company exists;
             None when the result page contains no such link;
             '' when the captcha image is empty, the solver returned an
             unusable answer, or the site rejected the answer (callers are
             expected to retry on '').
    :raises Exception: any error raised by the HTTP requests or by the
             captcha-solving service is logged and re-raised unchanged.
    """
    # type=1 asks for the idiom-style captcha and is mandatory (original
    # author's note).
    img_url = r'http://gsxt.hljaic.gov.cn/validateCode.jspx?type=1&id=0.618735927855596'
    check_url = 'http://gsxt.hljaic.gov.cn/searchList.jspx'
    img_headers = {
        'Accept': 'image/webp,image/*,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Host': 'gsxt.hljaic.gov.cn',
        'Referer': 'http://gsxt.hljaic.gov.cn/search.jspx',
        'User-Agent': ua,
    }

    req = requests.session()
    req.headers = img_headers
    try:
        # Step 1: download the captcha image.
        try:
            img = req.get(img_url, timeout=30).content
        except Exception as e:
            logger.error(e)
            raise  # bare raise keeps the original traceback
        if not img:  # empty body: treat like a failed captcha so the caller retries
            return ''

        # Step 2: hand the image to the captcha-solving service.
        try:
            res_code = kill_captcha(img, "hlj", "jpg")
        except Exception as e:
            logger.error("破解验证码的服务，出现异常")
            logger.error(e)
            raise

        logger.info('验证码为:%s' % res_code)
        if not res_code or len(res_code) > 100 or str(res_code) in ('None', 'wrong'):
            logger.error("破解验证码的服务出现异常,可能是下载的验证码错误，也可能破解服务出现异常！")
            return ''  # empty string tells the caller to try again

        # Step 3: submit the search form together with the solved captcha.
        req_check_data = {
            'entName': companyName,
            'checkNo': res_code,
        }
        try:
            check_res = req.post(check_url, data=req_check_data, timeout=30).content
        except Exception as e:
            logger.error(e)
            raise

        # The site answers with this notice when the captcha is wrong or expired.
        if '验证码不正确或已失效！' in check_res:
            return ''  # empty string tells the caller to try again

        com_info_list = re.findall(r'"(/businessPublicity\.jspx\?id=.*?)">', check_res)
        if com_info_list:
            logger.info("搜索的公司存在！")
            return com_info_list[0]

        logger.info("搜索的公司不存在")
        return None  # no matching company
    finally:
        # Release the pooled connections held by the session.
        req.close()


def _fetch_paged_list(session, url, ent_id, log_label, max_pages=49, min_rows=5):
    """
    Download successive pages (``pno`` = 1, 2, ...) of one list endpoint.

    Paging stops as soon as one of these holds:
      * a page is byte-identical to the previous one (that duplicate is not kept);
      * a page has fewer than ``min_rows`` ``<tr>...</tr>`` matches (the page is kept);
      * ``max_pages`` pages have been collected.

    :param session: requests session used for the GET calls.
    :param url: endpoint URL.
    :param ent_id: value sent as the ``mainId`` query parameter.
    :param log_label: message logged before every request.
    :return: list of raw response bodies, one per kept page.
    """
    pages = []
    for page_no in range(1, max_pages + 1):
        logger.info(log_label)
        body = session.get(url, params={'pno': page_no, 'mainId': ent_id}, timeout=30).content
        if pages and pages[-1] == body:
            break
        pages.append(body)
        # NOTE(review): the pattern is applied without re.S, so only rows that
        # sit on a single line are counted - kept exactly as before; confirm
        # against the live page layout.
        if len(re.findall("<tr>(.*?)</tr>", body)) < min_rows:
            break
    return pages


def get_company_info(com_info):
    """
    Download the raw pages of a company: main page, list pages, annual reports.

    :param com_info: relative link of the company page as returned by the
                     search step (``/businessPublicity.jspx?id=...``).
    :return: dict with the keys ``province``, ``type``, ``html``, ``yearList``,
             ``keyword``, ``companyName`` and ``json``. ``html`` maps
             ``base``, ``share_more``, ``share_detail``, ``alter_more``,
             ``mem_more``, ``child_more``, ``check_more``, ``abnormal_more``
             and ``year_index`` to raw response bodies (lists of bodies for
             the paged entries). ``yearList`` is a list of
             ``{"year": ..., "base": ...}`` dicts, one per annual report link.
    :raises Exception: when ``com_info`` is empty or carries no ``=<id>`` part;
             HTTP errors from ``requests`` propagate unchanged.
    """
    if not com_info:
        raise Exception("com_list 错误")
    # Everything after the first '=' is the company id. It is validated before
    # any network resource is created.
    ent_ids = re.findall(r'=(.+)', com_info)
    if not ent_ids:
        raise Exception("entId 错误")
    ent_id = ent_ids[0]

    raw_dict = {
        "province": "hlj",
        "type": "1",
        "html": "",
        "yearList": [],
        "keyword": "",
        "companyName": "",
        "json": "",
    }
    raw_base_dict = {}
    root_url = 'http://gsxt.hljaic.gov.cn'

    req = requests.session()
    req.headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Upgrade-Insecure-Requests": "1",
        'Connection': 'keep-alive',
        'Host': 'gsxt.hljaic.gov.cn',
        'User-Agent': ua,
    }
    try:
        logger.info("开始获取公司基本信息！")
        raw_base_dict["base"] = req.get(root_url + com_info, timeout=30).content

        # Shareholder list.
        share_more = _fetch_paged_list(
            req, 'http://gsxt.hljaic.gov.cn/QueryInvList.jspx', ent_id, "股东信息！")
        raw_base_dict["share_more"] = share_more

        # Shareholder detail pages linked from the shareholder rows.
        share_detail_url = "http://gsxt.hljaic.gov.cn/queryInvDetailAction.jspx"
        share_detail_list = []
        for a_share in share_more:
            for a_tr in re.findall("<tr>(.*?)</tr>", a_share):
                logger.info("股东信息详情！")
                share_id = re.findall(r"/queryInvDetailAction\.jspx\?id=(.*?)'", a_tr)
                if share_id:
                    share_detail_list.append(
                        req.get(share_detail_url, params={"id": share_id[0]}, timeout=30).content)
        raw_base_dict["share_detail"] = share_detail_list

        # Change records.
        raw_base_dict["alter_more"] = _fetch_paged_list(
            req, 'http://gsxt.hljaic.gov.cn/QueryAltList.jspx', ent_id, "变更信息！")

        # Key personnel.
        raw_base_dict["mem_more"] = _fetch_paged_list(
            req, 'http://gsxt.hljaic.gov.cn/QueryMemList.jspx?', ent_id, "主要人员信息！")

        # Branch offices.
        raw_base_dict["child_more"] = _fetch_paged_list(
            req, 'http://gsxt.hljaic.gov.cn/QueryChildList.jspx?', ent_id, "分支机构信息！")

        # Spot checks.
        raw_base_dict["check_more"] = _fetch_paged_list(
            req, "http://gsxt.hljaic.gov.cn/QuerySpotCheckList.jspx", ent_id, "抽查检查信息！")

        # Abnormal operation records. The log label previously repeated the
        # spot-check message by mistake.
        raw_base_dict["abnormal_more"] = _fetch_paged_list(
            req, "http://gsxt.hljaic.gov.cn/QueryExcList.jspx", ent_id, "经营异常信息！")

        # Annual report index and the individual reports it links to.
        logger.info("开始获取公司年报！")
        year_index_url = "http://gsxt.hljaic.gov.cn/enterprisePublicity.jspx"
        year_index_res = req.get(year_index_url, params={"id": ent_id}, timeout=30).content
        raw_base_dict["year_index"] = year_index_res

        raw_year_html_list = []
        year_links = re.findall('''(/QueryYearExamineDetail.*?)".*?>(.*?)年度报告''', year_index_res)
        for year_path, year in year_links:
            logger.info("开始解析公司%s年报！", year)
            raw_year_html_list.append({
                "year": year,
                "base": req.get(root_url + year_path, timeout=30).content,
            })
    finally:
        # Release the pooled connections held by the session.
        req.close()

    raw_dict["html"] = raw_base_dict
    raw_dict["yearList"] = raw_year_html_list
    return raw_dict


def extract_base_info(raw_dict):
    """
    Turn the raw pages collected for a company into the structured base record.

    :param raw_dict: dict whose ``html`` entry maps page names (``base``,
                     ``share_more``, ``share_detail``, ``alter_more``,
                     ``mem_more``, ``child_more``, ``abnormal_more``,
                     ``check_more``) to raw HTML.
    :return: a deep copy of ``TE.void_base_dict`` filled with ``basicList``,
             ``province``, ``shareHolderList``, ``alterList``, ``personList``,
             ``filiationList``, ``liquidationList``, ``abnormalOperation``
             and ``checkMessage``.
    :raises Exception: when ``raw_dict`` or its ``html`` entry is empty, or
             when no basic-information table is found in the ``base`` page.
    """
    if not raw_dict:
        raise Exception("raw_dict 错误")
    pages = raw_dict.get("html", {})
    if not pages:
        raise Exception("raw_dict 错误")

    def parse_pages(title, key, padding=""):
        # Parse every stored page under `key` with table.index and join the
        # results. `padding` is prepended to each page when it is non-empty.
        rows = []
        for chunk in pages.get(key, []):
            rows = rows + table.index(title, padding + chunk if padding else chunk)
        return rows

    two_empty_rows = "<tr></tr>" * 2

    # Basic information (mandatory).
    base_html = pages.get("base")
    base_table = table.table_clean(base_html, "基本信息")
    if not base_table:
        raise Exception("基本信息错误")
    result = copy.deepcopy(TE.void_base_dict)
    result["basicList"] = table.index("基本信息", base_table)
    result["province"] = "hlj"

    # Shareholders as listed on the paged overview: name and type only.
    holders = []
    for overview in pages.get("share_more", []):
        if not overview:
            continue
        for row in re.findall("(<tr.*?</tr>)", overview, re.S):
            cells = re.findall("<td.*?>(.*?)</td>", row, re.S)
            if len(cells) > 4:
                entry = copy.deepcopy(TE.shareHolder_dict)
                entry['shareholderName'] = cells[0]
                entry['shareholderType'] = cells[3]
                holders.append(entry)

    # Shareholder detail pages: the fourth row of the first table carries the
    # figures; it is used only when it has exactly nine cells.
    for detail_html in pages.get("share_detail", []):
        tables = re.findall("(<table.*?</table>)", detail_html, re.S)
        if not tables:
            continue
        rows = re.findall("(<tr.*?</tr>)", tables[0], re.S)
        if len(rows) <= 3:
            continue
        cells = re.findall("<td.*?>(.*?)</td>", rows[3], re.S)
        if len(cells) != 9:
            continue
        detail = {
            'shareholderName': cells[0],
            'shareholderType': '',  # shareholder type
            'country': '',  # country
            'subConam': cells[1],  # subscribed capital (unit: 10k yuan)
            'regCapCur': '',  # currency
            'conDate': parse_time(cells[5]),  # contribution date
            'fundedRatio': '',  # contribution ratio
        }
        # Replace each overview entry of the same (non-empty) name with the
        # detail record, carrying over the type taken from the overview.
        for pos, holder in enumerate(holders):
            if cells[0] and holder.get("shareholderName", "") == cells[0]:
                detail["shareholderType"] = holder.get("shareholderType", "")
                holders[pos] = copy.deepcopy(detail)
    result["shareHolderList"] = holders

    # Change records.
    result["alterList"] = parse_pages("变更信息", "alter_more", two_empty_rows)

    # Key personnel, first occurrence of each record only.
    unique_people = []
    for person in parse_pages("主要人员信息", "mem_more"):
        if person not in unique_people:
            unique_people.append(person)
    result["personList"] = unique_people

    # Branch offices.
    result["filiationList"] = parse_pages("分支机构信息", "child_more", two_empty_rows)

    # Liquidation information comes from the main page.
    liquidation_table = table.table_clean(base_html, "清算信息")
    if liquidation_table:
        result["liquidationList"] = table.index("清算信息", liquidation_table)
    else:
        result["liquidationList"] = []

    # Abnormal operation records.
    result["abnormalOperation"] = parse_pages("经营异常信息", "abnormal_more", two_empty_rows)

    # Spot-check records.
    result["checkMessage"] = parse_pages("抽查检查信息", "check_more", two_empty_rows)

    return result


def extract_year_info(raw_dict):
    """
    Parse every downloaded annual report into a structured record.

    :param raw_dict: dict whose ``yearList`` entry is a list of
                     ``{"year": ..., "base": <raw html>}`` items.
    :return: list with one deep copy of ``TE.void_year_dict`` per item, filled
             with ``year`` and the parsed sections; a section whose table is
             not found gets an empty dict or list.
    :raises Exception: when ``raw_dict`` is empty.
    """
    if not raw_dict:
        raise Exception("raw_dict 错误")

    # Each entry: (result key,
    #              titles tried with table.table_clean, in order,
    #              title handed to table.report_index,
    #              factory for the value used when no table is found).
    sections = (
        ("baseInfo", ("企业基本信息", "基本信息"), "企业基本信息", dict),
        ("website", ("网站或网店信息",), "网站或网店信息", dict),
        ("investorInformations", ("股东（发起人）及出资信息",), "股东及出资信息", list),
        ("entinvItemList", ("对外投资信息",), "对外投资信息", list),
        ("assetsInfo", ("企业资产状况信息",), "企业资产状况信息", dict),
        ("equityChangeInformations", ("股权变更信息",), "股权变更信息", list),
        ("changeRecords", ("修改记录",), "修改记录", list),
    )

    reports = []
    for raw_item in raw_dict.get("yearList", []):
        report = copy.deepcopy(TE.void_year_dict)
        report["year"] = raw_item.get("year", "")
        report_html = raw_item.get("base", "")

        for key, clean_titles, report_title, make_empty in sections:
            cleaned = None
            # Stop at the first title that yields a table.
            for title in clean_titles:
                cleaned = table.table_clean(report_html, title)
                if cleaned:
                    break
            if cleaned:
                report[key] = table.report_index(report_title, cleaned)
            else:
                report[key] = make_empty()

        reports.append(report)

    return reports


def search2(companyName, MAXTIME=40):
    """
    Search a company by name, retrying the captcha step up to MAXTIME times.

    :param companyName: company name to search for.
    :param MAXTIME: maximum number of captcha attempts.
    :return: None when the site reports no matching company; otherwise the
             tuple ``(raw_dict, asic_dict, gate_method)`` produced by
             ``search3`` (``asic_dict`` is None when parsing failed).
    :raises Exception: when every captcha attempt failed, or when the captcha
             step / page download raises.
    """
    com_list = ''
    for attempt in range(MAXTIME):
        if attempt:
            logger.error("重复破解验证码!当前设定重复破解次数为:%s, 剩余次数为:%s " % (MAXTIME, MAXTIME - attempt))
        try:
            com_list = download_captcha_kill(companyName)
        except Exception:
            # print_exc() takes no exception argument (its first parameter is
            # `limit`); a bare raise keeps the original traceback.
            traceback.print_exc()
            raise
        if com_list is None:  # company does not exist, even on the last attempt
            return None
        if com_list != '':  # '' means the captcha failed: try again
            break
    else:
        # Every attempt returned '' (or MAXTIME allowed no attempt at all).
        raise Exception("多次破解验证码错误,当前设置次数为：%s" % MAXTIME)

    # Downloading and parsing is exactly what search3 does for a known link.
    return search3({'data': com_list, 'companyName': companyName})


def search(companyName):
    """
    Search a company by name and return only its parsed record.

    :param companyName: company name to search for.
    :return: the second element of the ``search2`` result, or None when
             ``search2`` returns nothing.
    """
    outcome = search2(companyName)
    return outcome[1] if outcome else None


def search3(gate_method):
    """
    Download and parse a company whose page link is already known.

    :param gate_method: dict with a ``data`` entry (the company page link) and
                        optionally ``companyName`` (fallback name).
    :return: tuple ``(raw_dict, asic_dict, gate_method)``. ``raw_dict`` is the
             result of ``get_company_info`` with ``companyName`` filled in.
             ``asic_dict`` is the parsed record including ``yearReportList``,
             or None when parsing raised. ``gate_method`` is a fresh dict
             describing how to reach the company again.
    :raises Exception: when ``gate_method`` has no ``data`` key; errors from
             ``get_company_info`` propagate unchanged.
    """
    if 'data' not in gate_method:
        raise Exception("gate_method error, doesn't have `data` key")
    com_list = gate_method['data']
    companyName = gate_method.get('companyName', '')

    raw_dict = get_company_info(com_list)

    asic_dict = None
    company_name = companyName
    try:
        parsed = extract_base_info(raw_dict)
        parsed['yearReportList'] = extract_year_info(raw_dict)
        # Prefer the name published on the site; fall back to the caller's.
        company_name = parsed['basicList'][0].get('enterpriseName', '') or companyName
        asic_dict = parsed
    except Exception:
        # Best effort: a parse failure still returns the raw pages, but it is
        # logged as an error with its traceback.
        logger.error(traceback.format_exc())

    raw_dict['companyName'] = company_name
    new_gate_method = {
        'url': 'http://gsxt.hljaic.gov.cn',
        'method': 'get',
        'province': 'hlj',
        'companyName': company_name,
        'data': com_list,
    }
    return raw_dict, asic_dict, new_gate_method


if __name__ == "__main__":
    import json

    # Manual smoke test: search one company and dump the full search2 result.
    # Uncomment a different name below to try another company.
    # companyName = u'黑龙江省电力有限公司'
    companyName = u'黑龙江恒隆房地产开发有限责任公司'
    # companyName = u'黑龙江省骄阳房地产经纪有限公司'
    # companyName = u'大兴安岭阳光燃气经销有限公司'
    # companyName = u'大兴安岭博安电子科技有限公司'
    # companyName = u'大兴安岭天利物业有限公司塔河物业管理处'
    # companyName = u'大兴安岭盛林土石方挖掘有限公司'
    # companyName = u'大兴安岭绿森营造林有限责任公司'
    # companyName = u'大兴安岭丽家物业管理有限公司'
    # companyName = u'大兴安岭宏盛纺织有限公司'
    # companyName = u'加格达奇区新兰天地网吧'
    # companyName = u'大兴安岭向国冷链物流有限公司'
    # companyName = u'黑龙江弘腾建筑工程有限公司加格达奇区分公司'
    # companyName = u'大庆市让胡路区唐国臣废品收购站'
    # companyName = u'大庆市让胡路区双兴棋牌室'
    # companyName = u'大庆市让胡路区刘发贤种植园'
    # companyName = u'大庆市让胡路区李传海养殖场'
    # companyName = u'五常市民意乡二胖修理部'
    # companyName = u'五常市韩式电热炕板商店'
    # companyName = u'大庆市让胡路区符俊行温室'
    # companyName = u'五常市丰年水稻种植农民专业合作社'
    # companyName = u'五常市东顺元超市'
    # companyName = u'五常市天缘食杂店'
    # companyName = u'五常市兴盛乡讯达通讯服务部'
    # companyName = u'五常市劝业场门口内西侧国才修表店'
    # companyName = u'五常市拉林镇北土村姗姗发廊'
    # companyName = u'五常市山河镇美美思美容美发店'
    # companyName = u'勃利县罗泉营业所'
    # companyName = u'勃利县煤炭工业总公司第九分公司'
    # companyName = u'勃利县龙达文化用品商店'

    # search(companyName) can be used instead to get only the parsed record.
    result = search2(companyName)

    print(json.dumps(result, indent=4, ensure_ascii=False))
